Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ------------------------------------------------------------------------ tidyverse 1.3.1 --
v tibble 3.1.1 v dplyr 1.0.5
v tidyr 1.1.3 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.1
v purrr 0.3.4
-- Conflicts --------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
Attaching package: 㤼㸱lubridate㤼㸲
The following objects are masked from 㤼㸱package:base㤼㸲:
date, intersect, setdiff, union
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Loading required package: Matrix
Attaching package: 㤼㸱Matrix㤼㸲
The following objects are masked from 㤼㸱package:tidyr㤼㸲:
expand, pack, unpack
Attaching package: 㤼㸱arules㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
recode
The following objects are masked from 㤼㸱package:base㤼㸲:
abbreviate, write
df_artist <- read.csv("data/df_artist_sin_duplicados.csv")
df_charts_raw <- read.csv("data/df_charts_sin_duplicados.csv")
df_audio_features_raw <- read.csv("data/audio_features_plano_sin_duplicados.csv")
df_lyrics <- read.csv("data/df_lyrics.csv")
# DF listo para el join con chrats
df_audio_features <- df_audio_features_raw %>%
group_by(track_name, external_urls_spotify) %>%
mutate(artist_all = paste(artist_name, collapse = ",|,")) %>%
ungroup() %>%
mutate(artist_key = sub(",|,.*", "", artist_all)) %>%
dplyr::select(artist_name, artist_all, artist_key, everything(.)) %>%
distinct(artist_key, external_urls_spotify, .keep_all = T) %>%
as.data.frame()
cant_marketscontar_market <- function(x){
q <- length(unlist(strsplit(x, split = ",")))
return (q)
}
df_audio_features$cant_markets <- sapply(df_audio_features[,"markets_concat"], contar_market)
ggplot(data = melt(df_charts[, cols]), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")
No id variables; using all as measure variables
audio_features Y chartsmd.pattern(join_audio_charts, rotate.names = TRUE)
artist_key track_name external_urls_spotify semanas_sum streams_sum streams_min
1975 1 1 1 1 1 1
2 1 1 1 1 1 1
1326 1 1 1 1 1 1
0 0 0 0 0 0
streams_max position_avg position_min position_max popularidad artist_name
1975 1 1 1 1 1 1
2 1 1 1 1 1 1
1326 1 1 1 1 1 0
0 0 0 0 0 1326
artist_all album_name acousticness danceability duration_ms energy instrumentalness
1975 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1
1326 0 0 0 0 0 0 0
1326 1326 1326 1326 1326 1326 1326
liveness loudness speechiness tempo valence cant_markets explicit key_name
1975 1 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1 1
1326 0 0 0 0 0 0 0 0
1326 1326 1326 1326 1326 1326 1326 1326
mode_name key_mode album_release_year
1975 1 1 1 0
2 1 1 0 1
1326 0 0 0 19
1326 1326 1328 25196
par(mfrow=c(4,3))
for (feature in features_continuas){
boxplot(df_audio_features[,feature], las=2, horizontal=T, main=feature)
}
Con excepción de valence el resto de las features poseían cierto sesgo. Se decidió transformar las variables que mayor sesgo poseían: duration_ms, instrumentalness, liveness, speechiness como método de corregir la distribución y achicar la cantidad de outliers. La variable loudness_reg_imp no fue modificada debido a que al ser negativa
#histogramas
ggplot(melt(df_sesgadas_log_adjust), aes(value))+
geom_histogram()+
facet_wrap(~variable)
No id variables; using all as measure variables
#discretizacion
df_ly_feat_ok$cat_danceability = cut(df_ly_feat_ok$danceability,
breaks = c(0.4,0.7,0.8, 1),#quantile(df_feat$danceability),
labels = c("Baja","Media", "Alta"))
variables_plot <- unlist(strsplit("duration_ms", ","))
variables_plot <- append(variables_plot, paste(variables_plot,"_log", sep=""))
variables_plot <- variables_plot[order(variables_plot)]
plotear <- merged[,variables_plot]
par(mfrow = c(1,2))
for (col in names(plotear)){
hist(plotear[,col], breaks="FD", main=col, xlab="")
}
transformacion <- c('instrumentalness','loudness','liveness','speechiness', 'duration_ms')
logaritmo_ajustado = function(x,delta){
if (x<=0.0){
return(log(0.00+delta, base = 10))
}else{
return(log(x, base = 10))
}
}
delta <- 10^(-6)
par(mfrow=c(2,5))
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
hist(unlist(lapply(df_audio_features[,feature], function(x) logaritmo_ajustado(x,delta))), main=paste(feature,"log", sep="_"))
}
inv_sqrt_ajustada = function(x, delta){
if (x==0.0){
return(1/sqrt(x+delta))
}else{
return(1/sqrt(x))
}
}
delta <- 10^(-6)
par(mfrow=c(2,5))
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
hist(unlist(lapply(df_audio_features[,feature], function(x) inv_sqrt_ajustada(x,delta))), main=paste(feature,"inv_sqt", sep="_"))
}
par(mfrow=c(2,5))
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
hist(sqrt(df_audio_features[,feature]), main=paste(feature,"sqrt", sep="_"))
}
par(mfrow = c(2,1))
hist(df_audio_features[,'loudness_reg_imp'], main='loudness', xlab="")
#hist(sqrt(df_audio_features[,'loudness_reg_imp']), main= 'loudness_sqrt', xlab="")
boxplot(df_audio_features[,'loudness_reg_imp'], horizontal = T)
#boxplot(sqrt(df_audio_features[,'loudness_reg_imp']), horizontal = T)
fit <- lm(loudness~energy+acousticness, data=df_audio_features)
modelo <- fit$coefficients
df_audio_features$loudness_reg_imp <- df_audio_features$loudness
X <- df_audio_features[df_audio_features$loudness>0, c('energy', "acousticness")]
df_audio_features$loudness_reg_imp[df_audio_features$loudness>0] <- modelo[1]+modelo[2]*X[,1]+modelo[3]*X[,2]
summary(df_audio_features[,c("loudness", "loudness_reg_imp")])
summary(fit)
instrumentalness tiene mucho sesgo la variable. Se va a recurrir a una logaritmización de la variable, previa transformación del dominio, haciendo que los valores que son 0, sean en realidad 0.0000001
logaritmo_ajustado = function(x,delta){
if (x==0.0){
return(log(x+delta, base = 10))
}else{
return(log(x, base = 10))
}
}
delta <- 10^(-6)
df_audio_features$instrumentalness_logadjust <- unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta)))
par(mfrow =c(2,2))
hist(df_audio_features$instrumentalness, main="insrumentalness", xlab="")
hist(unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta))), main='instrumentalness_logadjust', ylim = c(0,130500), xlab = "")
boxplot(df_audio_features$instrumentalness, main="", horizontal = T)
boxplot(unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta))), main="", horizontal=T)
# hist(log(1/sqrt(df_audio_features$instrumentalness+0.00001)),main='log(sqrt(x+))', ylim=c(0,130500), xlab = "")
¿Es útil esta transformación?
delta <- 10^(-6)
df_audio_features$instrumentalness_logadjust <- unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta)))
df_chart_tojoin <- df_charts[,c("Track_Name", "Artist", "URL")]
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[, c("track_name","artist_key","external_urls_spotify","instrumentalness", "instrumentalness_logadjust")]
join_histogram <- df_audio_features_tojoin %>%
dplyr::select("track_name","artist_key","external_urls_spotify","instrumentalness", "instrumentalness_logadjust") %>%
left_join( df_chart_tojoin %>%
select("Track_Name", "Artist", "URL","isinchart"),
by = c(
"track_name" = "Track_Name",
"artist_key" ="Artist",
"external_urls_spotify" = "URL"))
join_histogram$isinchart[is.na(join_histogram$isinchart)] <- 0
join_histogram$isinchart <- factor(join_histogram$isinchart)
h11 <- hist(join_histogram[join_histogram$isinchart==1,'instrumentalness'])
h11$density <- h11$counts/sum(h11$counts)*100
h12 <- hist(join_histogram[join_histogram$isinchart==0,'instrumentalness'])
h12$density <- h12$counts/sum(h12$counts)*100
h21 <- hist(join_histogram[join_histogram$isinchart==1,'instrumentalness_logadjust'])
h21$density <- h21$counts/sum(h21$counts)*100
h22 <- hist(join_histogram[join_histogram$isinchart==0,'instrumentalness_logadjust'])
h22$density <- h22$counts/sum(h22$counts)*100
#png("C:/Users/Asus/Desktop/DATA SCIENCE/MAESTRIA/Data Mining/TP/graficos/instrumentalness.png",
# width = 800, height = 800)
par(mfrow = c(3,2))
plot(h11, main='instrumentalness \nchart', xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h12, main='instrumentalness \nfuera chart', xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h21, main ="instrumentalness_log \nchart", xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h22, main ="instrumentalness_log \nfuera chart", xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
boxplot(join_histogram[join_histogram$isinchart==1,'instrumentalness_logadjust'], main="instrumentalness_log chart", horizontal = T)
boxplot(join_histogram[join_histogram$isinchart==0,'instrumentalness_logadjust'], main="instrumentalness_log fuera chart", horizontal = T)
#dev.off()
################################
## FILTRAMOS OUTLIERS POR Z-SCORE para 'danceability', 'tempo', 'valence'
##############################
#z-score para variables que tienden a la normal
#filtro features numericos
#divido los features por su distribución
features_continuas_media <- c('danceability', 'tempo', 'valence')
df_audio_features_zscore_media <- df_audio_features[,features_continuas_media]
#normalizo z score con las variables que tienden a la normal
zscore_cols <- c()
for(col in names(df_audio_features_zscore_media)){
name_col <- paste('zscore_',col, sep = "")
zscore_cols <- append(zscore_cols, name_col)
media <- mean(df_audio_features_zscore_media[,col])
stdv <- sd(df_audio_features_zscore_media[,col])
df_audio_features_zscore_media[,name_col] <- (df_audio_features_zscore_media[,col] - media)/stdv
}
par(mfrow=c(1,length(zscore_cols)))
lapply(zscore_cols, function(col) boxplot(df_audio_features_zscore_media[,col],xlab=col))
Danceability
#variable: danceability
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_danceability> umbral_zscore) | (df_audio_features_zscore_media$zscore_danceability< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, danceability ) %>%
arrange(-danceability)
Tempo
#variable: Tempo
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_tempo> umbral_zscore) | (df_audio_features_zscore_media$zscore_tempo< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, tempo ) %>%
arrange(-tempo)
Valence
#variable: valence
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_valence> umbral_zscore) | (df_audio_features_zscore_media$zscore_valence< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, valence ) %>%
arrange(-valence)
################################
## FILTRAMOS OUTLIERS POR Z-SCORE MODIFICADO para 'acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets'
##############################
features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')
df_audio_features_zscore_mediana <- df_audio_features[,features_continuas_mediana]
zscoremodif_cols <- c()
for(col in names(df_audio_features_zscore_mediana)){
name_col <- paste('zscoremodif_',col, sep = "")
zscoremodif_cols <- append(zscoremodif_cols, name_col)
med = median(df_audio_features_zscore_mediana[,col], na.rm = T)
MAD = median(abs(df_audio_features_zscore_mediana[,col] - med), na.rm = T)
df_audio_features_zscore_mediana[, name_col] <- 0.6745 * (df_audio_features_zscore_mediana[,col] - med) / MAD
}
par(mfrow=c(4,2))
lapply(zscoremodif_cols, function(col) boxplot(df_audio_features_zscore_mediana[,col],xlab=col, horizontal = T))
Revisión Variable Instrumentalness
instrumentalness <- c("instrumentalness", "zscoremodif_instrumentalness")
x <- df_audio_features$instrumentalness
n_interv <- 10
intervalos <- round(seq(0,max(x),by=(max(x)-min(x))/n_interv),2)
labs <- c()
for (i in 1:n_interv){
lab <- paste(intervalos[i],intervalos[i+1], sep='\n')
labs <- append(labs, lab)
}
bins <- cut(x, n_interv, include.lowest = TRUE, labels = labs)
barplot(table(bins))
Hacemos K-means para poder discretizar la variable.
sse <- c()
for (k in 2:6){
clusters <- kmeans(df_audio_features$instrumentalness,centers = k, iter.max = 10, nstart = k)
sse <- append(sse, clusters$tot.withinss)
}
plot(2:6,sse, type = 'l', xlab='Cantidad de Clusters', ylab='Suma Error Cuadrático')
#k=3
clusters3 <- kmeans(df_audio_features$instrumentalness,centers = 3, iter.max = 10, nstart = 3)
df_audio_features$clusters <- factor(clusters3$cluster)
lev <- levels(df_audio_features$clusters)
labs <- c()
for (i in lev){
min <- min(df_audio_features$instrumentalness[df_audio_features$clusters==i])
max <- max(df_audio_features$instrumentalness[df_audio_features$clusters==i])
lab <- paste(min,max, sep=' - ')
labs <- append(labs, lab)
}
labs
# barplot(table(factor(clusters3$cluster)), labels = labs)
#prueba igal de transformacion y test de normalidad
join_audio_charts[1:5,"acousticness"]^2
library(nortest)
log10(df_chart_w_lyrics$acousticness)
for (i in features_continuas){
x <- log10(df_chart_w_lyrics[,i])
x <- shapiro.test(x)
z <- x$p.value
print(z)
}
# lyrics = mongo(collection = "lyrics", db = "spotify_dm" )
# df_lyrics <- lyrics$find('{}')
#
# write.csv(df_lyrics, "data/df_lyrics.csv")
df_lyrics <- read.csv("data/df_lyrics.csv") %>%
select(-X)
df_lyrics_unicas <- df_lyrics %>%
distinct(artist_name, track_name, lyrics)
#filtro de idioma
spa_lyrics = df_lyrics_unicas[textcat(df_lyrics_unicas$lyrics)=="spanish",]
Error in textcat(df_lyrics_unicas$lyrics) :
no se pudo encontrar la función "textcat"
# comentar y descomentar según se elija un dataframe u otro
# df_lyrics_seleccionado = df_lyrics_unicas
df_lyrics_seleccionado = en_lyrics
corpus = Corpus(VectorSource(enc2utf8(df_lyrics_seleccionado$lyrics)))
# Eliminamos espacios
corpus.pro <- tm_map(corpus, stripWhitespace)
inspect(corpus.pro[1])
# Elimino todo lo que aparece antes del primer []
corpus.pro <- tm_map(corpus.pro, content_transformer(
function(x) sub('^.+?\\[.*?\\]',"", x)))
# inspect(corpus.pro[1])
# Elimino las aclaraciones en las canciones, por ejemplo:
# [Verso 1: Luis Fonsi & Daddy Yankee]
corpus.pro <- tm_map(corpus.pro, content_transformer(
function(x) gsub('\\[.*?\\]', '', x)))
# Elimino todo lo que aparece luego de 'More on Genius'
corpus.pro <- tm_map(corpus.pro, content_transformer(function(x) gsub("More on Genius.*","", x)))
# Convertimos el texto a minúsculas
corpus.pro <- tm_map(corpus.pro, content_transformer(tolower))
# removemos números
corpus.pro <- tm_map(corpus.pro, removeNumbers)
# Podemos agregar palabras a las stopwords
# my_stopwords <- append(stopwords("spanish"), 'palabra')
my_stopwords <- append(stopwords("english"), c('yeah', "aint", "get", "got"))
# Removemos palabras vacias
corpus.pro <- tm_map(corpus.pro, removeWords, stopwords("english"))
corpus.pro <- tm_map(corpus.pro, removeWords, my_stopwords)
# corpus.pro <- tm_map(corpus.pro, removeWords, stopwords("spanish"))
# inspect(corpus.pro[1])
# Removemos puntuaciones
corpus.pro <- tm_map(corpus.pro, removePunctuation)
# Removemos todo lo que no es alfanumérico
corpus.pro <- tm_map(corpus.pro, content_transformer(function(x) str_replace_all(x, "[[:punct:]]", " ")))
# En tm_map podemos utilizar funciones prop
library(stringi)
replaceAcentos <- function(x) {stri_trans_general(x, "Latin-ASCII")}
corpus.pro <- tm_map(corpus.pro, replaceAcentos)
# Eliminamos espacios que se van generando con los reemplazos
corpus.pro <- tm_map(corpus.pro, stripWhitespace)
#funciones
#funcion para corregir palabras
decontracted = function(txt){
txt = gsub("won't", "will not", txt)
txt = gsub("\\'s", " is", txt)
txt = gsub("\\'t", " not", txt)
txt = gsub("\\'ll", " will", txt)
txt = gsub("\\'m", " am", txt)
txt = gsub("\\'re", " are", txt)
txt = gsub("\\'d", " had", txt)
txt = gsub("\\'ve", " have", txt)
txt = gsub("couldn", "could", txt)
txt = gsub("don", "do", txt)
txt = gsub("doesn", "does", txt)
txt = gsub("isn", "is", txt)
txt = gsub("mustn", "must", txt)
txt = gsub("shouldn", "should", txt)
txt = gsub("wasn", "was", txt)
txt = gsub("\\'cause", " because", txt)
txt = gsub("\\'", "g", txt)
return(txt)
}
#Función para limpiar.
text_cleaning = function(txt, stop=FALSE, language){
txt = sub('^.+?\\[.*?\\]',"", txt) #ok
txt = sub("More on Genius.*","", txt)
txt = gsub('\\[.*?\\]', '', txt)
txt = gsub("\\n"," ", txt)
txt = gsub("[()]", " ", txt)
txt = tolower(txt)
txt = decontracted(txt)
txt = gsub("\\W+\\b", " ", txt)
txt = gsub("\\d", " ", txt)
stopwords_regex = paste(stopwords('en'), collapse = '\\b|\\b')
stopwords_regex = paste0('\\b', stopwords_regex, '\\b')
txt = stringr::str_replace_all(txt, stopwords_regex, '')
my_stopwords <- c('ooh', 'yeah', "aint", "get", "got", "ayy")
txt = stringr::str_replace_all(txt, my_stopwords, '')
txt = str_trim(txt)
txt = gsub("\\n"," ", txt)
if(language == "en"){
return(txt)
}else if (language == "es"){
txt <- function(x) {stri_trans_general(x, "Latin-ASCII")}
return(txt)
}else{
return("Falta definir lenguaje")
}
}
#función para obtener oraciones de una sola palabra.
one_word_setences = function(txt){
return(gsub("\\W+\\b", ". ", txt))
}
#limpio las letras en ingles
en_lyrics$lyrics = text_cleaning(en_lyrics$lyrics, language = "en")
longer object length is not a multiple of shorter object length
head(en_lyrics$lyrics, 1)
[1] "look good together reason watch night long yeah know will turn heads forever tonight gonna show walking watch whole room change baby baby play blame confidenceoh blame measurements shut shit sight right dripping finesse make sense dripping finesse know know dripping finesse make sense dripping finesse know know now slow baby love way feels grind yeah connection magnetic floor nothing can stop us tonight walking watch whole room change baby baby play blame confidenceoh blame measurements shut shit sight right dripping finesse make sense dripping finesse know know dripping finesse make sense dripping finesse know know fellas grab ladies lady fine tell one one life woo ladies grab fellas let right right one like mind ow yeah got going got going feel good us ayy yeah got going got going yeah girl got going yeah got going got going hey feel good us ayy feels good yeah got going got going dripping finesse dripping make sense dripping finesse know know dripping finesse baby make sense dripping finesse know know yeah know got going yeah got going got going feel good us ayy yeah got going got going know know yeah got going got going girl got feel good us ayy yeah got going got going know know"
#Diccionario español
malas_palabras_1 <- read_csv("data/malas_palabras.txt",
col_names = FALSE)
malas_palabras_2 <- read_csv("data/malas_palabras_translate.txt",
col_names = FALSE)
malas_palabras_3 <- read_csv("data/malas_palabras_wiki.txt",
col_names = FALSE) %>%
select(X1)
malas_palabras_4 <- read_csv("data/palabras_profanas_es.txt",
col_names = FALSE)
malas_palabras <- rbind(malas_palabras_1, malas_palabras_2,
malas_palabras_3, malas_palabras_4)
#Función para limpiar.
text_cleaning_esp = function(txt, stop=FALSE){
txt = sub('^.+?\\[.*?\\]',"", txt) #ok
txt = sub("More on Genius.*","", txt)
txt = gsub('\\[.*?\\]', '', txt)
txt = gsub("\\n"," ", txt)
txt = gsub("[()]", " ", txt)
txt = tolower(txt)
# txt = decontracted(txt)
txt = gsub("\\W+\\b", " ", txt)
txt = gsub("\\d", " ", txt)
txt = str_trim(txt)
# txt = stri_trans_general(txt, "Latin-ASCII")
return(txt)
}
malas_palabras$limpias = text_cleaning(malas_palabras$X1)
malas_palabras
malas_palabras %>% filter(startsWith(limpias, "g"))
bad_words <- c()
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_zac_anger)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_alvarez)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_arr_bad)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_racist)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_banned)))
bad_words <- unique(bad_words)
contar_bad_words <- function(x){
x <- profanity(x,profanity_list = bad_words)
q <- sum(x$profanity_count)
return (q)
}
df_chart_w_lyrics$cant_bad_words <- sapply(df_chart_w_lyrics[,"lyrics"], contar_bad_words)
df_chart_w_lyrics_only_explicit <- df_chart_w_lyrics[df_chart_w_lyrics$explicit==TRUE & df_chart_w_lyrics$cant_bad_words > 0, ]
hist(df_chart_w_lyrics_only_explicit$cant_bad_words)
#creo vars categóricas
df_chart_w_lyrics_only_explicit$nivel_puteada <- cut(df_chart_w_lyrics_only_explicit$cant_bad_words, breaks = c(0,10,20,50,Inf), labels=c("bajo","poco","alto","muy_alto"))
df_chart_w_lyrics_only_explicit$nivel_ranking <- cut(df_chart_w_lyrics_only_explicit$position_avg, breaks = c(1,100,Inf), labels=c("1a100","100a200"))
df_chart_w_lyrics_only_explicit$nivel_popularidad <- cut(sqrt(df_chart_w_lyrics_only_explicit$cant_bad_words), breaks = c(0,10,20,50,Inf), labels=c("bajo","poco","alto","muy_alto"))
transactions <- as(as.data.frame(apply(df_chart_w_lyrics_only_explicit, 2, as.factor)), "transactions")
rules = apriori(transactions, parameter=list(target="rules", confidence=0.25, support=0.1))
rules.sub <- subset(rules, subset = lhs %pin% "nivel_puteada" & rhs %pin% "nivel_ranking")
inspect(head(sort(rules.sub, by = "lift", decreasing = TRUE),10))
# discretizacion continuas y seleccion de variables
# identificar palabras explít
¿Qué características tienen las canciones que están en el chart? ¿Cual es el patrón comun que tienen las canciones más escuchadas? (ver dispersiones, media, grafico comparativo)
#funcion para escalar variable
scale_vble <- function(x){
(x - mean(x, na.rm = T))/sd(x, na.rm = T)
}
#anti_join
anti_join_audio_charts <- df_audio_features %>%
select("artist_name","artist_all", "artist_key",
"track_name", "external_urls_spotify", "album_name", "album_release_year",
all_of(features_continuas), all_of(features_categoricas)) %>%
anti_join( df_charts %>%
select( "Track_Name", "Artist", "URL"),
by = c("external_urls_spotify" ="URL",
"artist_key" ="Artist" ))
# by = c("track_name" = "Track_Name"))
anti_join_audio_charts_complete <- na.omit(anti_join_audio_charts)
anti_join_audio_charts_complete_scale <- anti_join_audio_charts_complete %>%
distinct() %>%
select(features_continuas) %>%
mutate_all(scale_vble)
nrow(anti_join_audio_charts_complete_scale)
join_audio_charts %>%
group_by(artist_name) %>%
dplyr::summarise(n = n()) %>%
arrange(-n)
join_audio_charts %>%
group_by(track_name, artist_name,external_urls_spotify) %>%
dplyr::summarise(n = n()) %>%
arrange(-n) %>%
select(track_name, n, everything(.))
# cantidad de semanas que estuvieron en el chart
df_charts %>%
mutate(week_start=as.Date(week_start),
week_end = as.Date(week_end),
week_year = (year(week_start))) %>%
arrange(Artist, Track_Name) %>%
group_by(Artist, Track_Name, URL) %>%
dplyr:: summarise( day_in = min(week_start),
year_in = year(day_in),
day_max = max(week_end),
year_max = year(day_max),
duracion_chart_dias = day_max-day_in,
duracion_chart_anio = year_max - year_in) %>%
arrange(Artist)